###############################################################################
# Plot 1
###############################################################################
# This Script contains the code to build the first figure
###############################################################################
# Content
###############################################################################
# 1) Dependencies
# 2) Load Data
# 3) Aggregation for Fig. 1
# 4) Fig. 1
###############################################################################
# 1) Dependencies
###############################################################################
library(readr)
library(dplyr)
library(ggplot2)
library(gganimate)
library(ggeffects)
library(ggExtra)
library(ggridges)
library(ggrepel)
library(ggpubr)
library(grid)
library(scales)
library(lubridate)
library(extrafont)
library(reshape2)
library(here)
library(ggforce)
library(png)
library(readxl)
library(grid)
library(gridExtra)
library(ggpubr)
library(ggalt)
library(stringr)
###############################################################################
# 2) Load Data
###############################################################################
# Set Path
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
rm(list=ls())

# Custom functions
# ggplot rescale x axis....
scale_x_reordered <- function(..., sep = "___") {
  reg <- paste0(sep, ".+$")
  ggplot2::scale_x_discrete(labels = function(x) gsub(reg, "", x), ...)
}
# ggplot order over facets...
reorder_within <- function(x, by, within, fun = mean, sep = "___", ...) {
  new_x <- paste(x, within, sep = sep)
  stats::reorder(new_x, by, FUN = fun)
}

suppressWarnings(source('ggplot_theme_ddl.R', encoding = "UTF-8"))

# Load Data
df <- readRDS("../data/smd_ner_2015_2019_combined.RDS")
candidates_list_15 <- read.csv('../support/candidates-2015/00-Named_Entity_List_withID.csv', stringsAsFactors = F) %>% 
  as_tibble %>% mutate(id=as.character(id))
candidates_list_19 <- read.csv('../support/candidates-2019/00-Named_Entity_List_withID.csv', stringsAsFactors = F) %>% 
  as_tibble %>% mutate(id=as.character(id))

candidates_list_19 <- candidates_list_19 %>% mutate(candidacy = as.character(gsub("\\s", " ", council))) %>% 
  mutate(council = case_when(candidacy %in% c("SR", "Former Staenderat", "Former Staenderat") ~ "sr",
                             candidacy %in% c("NR", "Former Nationalrat", "Former Nationalrat") ~  "nr",
                             candidacy %in% c("SR und NR", "NR und SR") ~ "sr & nr")) %>% 
  dplyr::select(-c(candidacy))

df$year <- as.character(df$year)
df$date <- format(as.Date(df$date, "%m-%d"), format = "%m-%d")
df$fullname <- ifelse(df$fullname == "Adèle Goumaz", "Adèle Thorens Goumaz", df$fullname)
df$fullname <- ifelse(df$fullname == "Niklaus-Samuel Gugger", "Nik Gugger", df$fullname)
df$incumbent <- ifelse(df$fullname == "Philipp Müller", 1, df$incumbent)

# Remove Federal Councilors
council <- TRUE
# Remove Party Presidents
president <- TRUE

# Council members 2015
council_15 <- c("Ueli Maurer", "Alain Berset", "Didier Burkhalter",
                "Simonetta Sommaruga", "Eveline Widmer Schlumpf",
                "Johann Schneider-Ammann", "Doris Leuthard")

# Council members 2019
council_19 <- c("Ueli Maurer", "Alain Berset", "Ignazio Cassis",
                "Simonetta Sommaruga", "Guy Parmelin",
                "Karin Keller-Sutter", "Viola Amherd")

# Party Presidents 2015
presi_15 <- c("Toni Brunner", "Christian Levrat", "Philipp Müller",
              "Christophe Darbellay", "Regula Rytz", "Martin Bäumle", "Martin Landolt")

# Party Presidents 2019
presi_19 <- c("Albert Rösti", "Christian Levrat", "Petra Gössi",
              "Gerhard Pfister", "Regula Rytz", "Jürg Grossen", "Martin Landolt")


# Remove Council Members:
if(council == T){
  df <- df %>% dplyr::filter((year == "2015" & !fullname %in% council_15) |
                               (year == "2019" & !fullname %in% council_19))
}
# Remove Party Presidents:
if(president == T){
  df <- df %>% dplyr::filter((year == "2015" & !fullname %in% presi_15) |
                               (year == "2019" & !fullname %in% presi_19))
}
###############################################################################
# 3) Aggregation for Fig. 1 Part A
###############################################################################
names(df)

df$n_candidates_on_list <- ifelse(df$year == "2019", 3599, 2888)

df <- df %>% mutate(n_candidates_on_list = case_when(year == "2019" & incumbent == 1 ~ 186,
                                                     year == "2019" & incumbent == 0 ~ 3599,
                                                     year == "2015" & incumbent == 1 ~ 200,
                                                     year == "2015" & incumbent == 0 ~ 2888
))

agg1 <- df %>% dplyr::group_by(year, gender, n_candidates_on_list, fullname, person.id) %>%
               dplyr::summarise(n = n()) %>% filter(!is.na(fullname) == T)

helper <- filter(agg1, year == "2015")
candidates_list_15_a <- candidates_list_15 %>% filter(!id %in% helper$person.id)

helper <- filter(agg1, year == "2019")
candidates_list_19_a <- candidates_list_19 %>% filter(!id %in% helper$person.id)

candidates_list_15_a <- candidates_list_15_a %>% dplyr::select(c(fullname,gender,id)) %>% 
  dplyr::mutate(person.id = id,
                n = 0,
                year = "2015") %>% 
  dplyr::select(-c("id"))

candidates_list_19_a <- candidates_list_19_a %>% dplyr::select(c(fullname,gender,id)) %>% 
  dplyr::mutate(person.id = id,
                n = 0,
                year = "2019") %>% 
  dplyr::select(-c("id"))


agg1 <- dplyr::bind_rows(agg1,candidates_list_15_a,candidates_list_19_a)

# Remove the Class NA
agg1 <- agg1  %>% dplyr::filter(is.na(person.id) == F)
store_agg1 <- agg1

agg1 <- agg1 %>% ungroup() %>% dplyr::group_by(year, gender) %>%
  dplyr::summarise(n_cases = n(),
            avg_n = mean(n),
            s = sd(n),
            se = sd(n)/sqrt(n_cases)) %>%
  distinct(.keep_all = T) %>% 
  dplyr::mutate(t_score = qt(p=0.975, df=(n_cases-1))) %>%
  dplyr::mutate(margin_error = t_score * se) %>% 
  dplyr::mutate(lower = avg_n - margin_error,
         upper = avg_n + margin_error)

# This formula calculates the p-values under the assumption we don't have equal variance, 
# hence we use the formula from the Welch t-test to calculate the degrees of freedom! 
# This is a bit more complex but more likely to be right given the fact that 
# the variance is not so small
compute_pval <- function(male_avg, female_avg, male_se, female_se, male_n, female_n) {
  
  t_stat <- (male_avg - female_avg) / sqrt(male_se^2 + female_se^2)
  
  df <- (male_se^2 + female_se^2)^2 / 
    (male_se^4/(male_n-1) + female_se^4/(female_n-1))
  
  p_value <- 2 * pt(-abs(t_stat), df)
  
  return(p_value)
}

# Compute p-values for each year and incumbent
p_vals <- agg1 %>%
  dplyr::group_by(year) %>%
  dplyr::summarise(
    p_value = compute_pval(
      avg_n[gender == "m"], avg_n[gender == "f"],
      se[gender == "m"], se[gender == "f"],
      n_cases[gender == "m"], n_cases[gender == "f"]
    ),
    y_position = mean(avg_n)
  )

p_vals <- p_vals %>% dplyr::mutate(y_position = ifelse(y_position < 25, 25, y_position),
                            x_position = ifelse(year == "2019", "2019", "2015"))

agg1 <- agg1[agg1$gender %in% c("m", "f"),]
agg1$gender <- ifelse(agg1$gender == "f", "Women", "Men")
###############################################################################
# 4) Fig. 1 Part A
###############################################################################
values_year <- c("2015" = "#DD2461", "2019" = "#7D7D7C")
values_gender <- c("Women" = "#DD2461", "Men" = "#7D7D7C")
shape_values_gender <- c("Women" = 16, "Men" = 17)



f1_1 <- ggplot(agg1, aes(x = reorder(year, -avg_n), y = avg_n, ymin = lower, ymax = upper, color = gender, shape = gender)) +
  geom_pointrange(position=position_dodge(0.2), fatten = 3, linewidth = 1.2, size = 1.5) +
  geom_text(data = p_vals, aes(x = x_position  , y = y_position, label = sprintf("p=%.3f", p_value)), vjust = -2.0, size = 6.5, inherit.aes = F) +
  coord_flip() +
  labs(x = "", y = "Average Number of Hits", title = "", color = "Gender:", shape = "Gender:") +
  scale_y_continuous(expand = c(0,0), limits = c(0,70), breaks = seq(0,70, by = 10)) +
  scale_color_manual(values=values_gender) +
  scale_fill_manual(values=values_gender) +
  scale_shape_manual(values=shape_values_gender) +
  ddl_theme(type = 'default',
            panel.grid.major=element_blank(),
            legend.position='none',
            axis.line.y.left = element_line(colour="black"),
            axis.line.x.bottom = element_line(colour="black")) +
  theme(legend.position = "none", legend.direction = "horizontal",
        strip.background = element_blank(), strip.text = element_text(color = "black"),
        axis.text.x = element_text(angle = 0, hjust = .5, vjust = 1, size = 20),
        axis.text.y = element_text(hjust=.5, size = 20),
        strip.text.x = element_text(size = 20),
        axis.title = element_text(size = 20),
        plot.title = element_text(size = 24),
        legend.text = element_text(size = 16),
        plot.margin = unit(c(.5,1.3,.5,.5), "cm"),
        legend.key.size = unit(1.5,"line"),
        axis.line.x = element_line(color="black", size = .5),
        axis.line.y = element_line(color="black", size = .5),
        panel.spacing.x=unit(2.5, "lines"))


f1_1
###############################################################################
# 5) Aggregation for Fig. 1 Part B
###############################################################################
agg1 <- df %>% dplyr::group_by(year, incumbent, gender, n_candidates_on_list, fullname, person.id) %>%
  dplyr::summarise(n = n()) %>% filter(!is.na(fullname) == T)

helper <- filter(agg1, year == "2015")
candidates_list_15_b <- candidates_list_15 %>% filter(!id %in% helper$person.id)

helper <- filter(agg1, year == "2019")
candidates_list_19_b <- candidates_list_19 %>% filter(!id %in% helper$person.id)

candidates_list_15_b <- candidates_list_15_b %>% dplyr::select(c(fullname,gender,incumbent,id)) %>% 
  dplyr::mutate(person.id = id,
                n = 0,
                year = "2015") %>% 
  dplyr::select(-c("id"))

candidates_list_19_b <- candidates_list_19_b %>% dplyr::select(c(fullname,gender,incumbent,id)) %>% 
  dplyr::mutate(person.id = id,
                n = 0,
                year = "2019") %>% 
  dplyr::select(-c("id"))


agg1 <- dplyr::bind_rows(agg1,candidates_list_15_b,candidates_list_19_b)

# Remove the Class NA
agg1 <- agg1  %>% dplyr::filter(is.na(person.id) == F)
store_agg1 <- agg1

agg1 <- agg1 %>% ungroup() %>% dplyr::group_by(year, incumbent, gender) %>%
  dplyr::summarise(n_cases = n(),
            avg_n = mean(n),
            s = sd(n),
            se = sd(n)/sqrt(n_cases)) %>%
  distinct(.keep_all = T) %>% 
  dplyr::mutate(t_score = qt(p=0.975, df=(n_cases-1))) %>%
  dplyr::mutate(margin_error = t_score * se) %>% 
  dplyr::mutate(lower = avg_n - margin_error,
         upper = avg_n + margin_error)

# This formula calculates the p-values under the assumption we don't have equal variance, 
# hence we use the formula from the Welch t-test to calculate the degrees of freedom! 
# This is a bit more complex but more likely to be right given the fact that 
# the variance is not so small
compute_pval <- function(male_avg, female_avg, male_se, female_se, male_n, female_n) {
  
  t_stat <- (male_avg - female_avg) / sqrt(male_se^2 + female_se^2)
  
  df <- (male_se^2 + female_se^2)^2 / 
    (male_se^4/(male_n-1) + female_se^4/(female_n-1))
  
  p_value <- 2 * pt(-abs(t_stat), df)
  
  return(p_value)
}

# Compute p-values for each year and incumbent
p_vals <- agg1 %>%
  dplyr::group_by(year, incumbent) %>%
  dplyr::summarise(
    p_value = compute_pval(
      avg_n[gender == "m"], avg_n[gender == "f"],
      se[gender == "m"], se[gender == "f"],
      n_cases[gender == "m"], n_cases[gender == "f"]
    ),
    y_position = mean(avg_n)
  )

p_vals <- p_vals %>% dplyr::mutate(y_position = ifelse(y_position < 25, 25, y_position),
                                   x_position = ifelse(incumbent == 1, "Incumbent", "Challenger"))

agg1 <- agg1[agg1$gender %in% c("m", "f"),]
agg1$gender <- ifelse(agg1$gender == "f", "Women", "Men")
agg1$incumbent <- ifelse(agg1$incumbent=="1","Incumbent","Challenger")
###############################################################################
# 6) Fig. 1 Part B
###############################################################################
values_year <- c("2015" = "#DD2461", "2019" = "#7D7D7C")
values_gender <- c("Women" = "#DD2461", "Men" = "#7D7D7C")
shape_values_gender <- c("Women" = 16, "Men" = 17)



f1_2 <- ggplot(agg1, aes(x = incumbent,1, y = avg_n, ymin = lower, ymax = upper, color = gender, shape = gender)) +
  geom_pointrange(position=position_dodge(0.2), fatten = 3, linewidth = 1.2, size = 1.5) +
  geom_text(data = p_vals, aes(x = x_position  , y = y_position + 1, label = sprintf("p=%.3f", p_value)), vjust = -2.0, size = 6.5, inherit.aes = F) +
  coord_flip() +
  facet_wrap(~year, ncol = 1) +
  labs(x = "", y = "Average Number of Hits", title = "", color = "Gender:", shape = "Gender:") +
  scale_y_continuous(expand = c(0,0), limits = c(0,750), breaks = seq(0,700, by = 50)) +
  scale_color_manual(values=values_gender) +
  scale_fill_manual(values=values_gender) +
  scale_shape_manual(values=shape_values_gender) +
  ddl_theme(type = 'default',
            panel.grid.major=element_blank(),
            legend.position='none',
            axis.line.y.left = element_line(colour="black"),
            axis.line.x.bottom = element_line(colour="black")) +
  theme(legend.position = "bottom", legend.direction = "horizontal",
        strip.background = element_blank(), strip.text = element_text(color = "black"),
        axis.text.x = element_text(angle = 0, hjust = .5, vjust = 1, size = 20),
        axis.text.y = element_text(hjust=.5, size = 20),
        strip.text.x = element_text(size = 20),
        axis.title = element_text(size = 20),
        plot.title = element_text(size = 24),
        legend.text = element_text(size = 20),
        legend.title = element_text(size=20),
        plot.margin = unit(c(.5,1.3,.5,.5), "cm"),
        legend.key.size = unit(1.5,"line"),
        axis.line.x = element_line(color="black", size = .5),
        axis.line.y = element_line(color="black", size = .5),
        panel.spacing.x=unit(2.5, "lines"))


f1_2

f_fin <- cowplot::plot_grid(f1_1,f1_2, ncol = 1, rel_heights = c(1,2))
f_fin

ggsave(plot = f_fin, filename ='../img_main/figure_1.png',width=16, height=12, dpi = 300, bg = "white")
